This document summarizes the microbiota analyses for the FEMAP+ACV study. There were 12 patients that participated in the study, but only 10 were complete with both “pre” and “post” samples (prior to and following 3 months of acetate supplementation, respectively). The two that were incomplete and not included in downstream analyses (at this time) were patients 003 and 004.
library(ALDEx2)
library(ARTool)
library(broom)
library(dplyr)
library(forcats)
library(ggplot2)
library(ggrepel)
library(gridExtra)
library(knitr)
library(Maaslin2)
library(microbiome)
library(patchwork)
library(phyloseq)
library(plyr)
library(RColorBrewer)
library(reshape2)
library(rlist)
library(sjmisc)
library(stringr)
library(tidyr)
library(zCompositions)
counts <- read.table("data/cutadapt_counts2.txt",
header = TRUE, row.names = 1, sep = "\t", check.names = FALSE,
quote = "", stringsAsFactors = FALSE)
tax <- read.table("data/cutadapt_tax2.txt",
header = TRUE, row.names = 1, sep = "\t", check.names = FALSE,
quote = "", stringsAsFactors = FALSE)
meta<-read.table("data/metadata.txt", header=T, row.names = 1, sep='\t', comment.char = "")
# make a phyloseq object
tax<-as.matrix(tax)
OTU = otu_table(counts, taxa_are_rows = FALSE)
TAX = tax_table(tax)
physeq = phyloseq(OTU, TAX)
From phyloseq R package, calculate various alpha diversity measures
div<- estimate_richness(physeq, split = TRUE, measures = NULL)
From microbiome R package, calculate various dominance indices
dom<- dominance(physeq)
Merge the data
div_all <- data.frame(div, dom)
rownames(div_all)<-gsub("X","", rownames(div_all))
div_merge<-merge(div_all, meta, by=0, all=FALSE)
div_merge2<-subset(div_merge, !is.na(timepoint))
div_merge2$participant<-as.factor(div_merge2$participant)
div_merge2<-div_merge2[!((div_merge2$Row.names) %in% c("003_post", "004_pre")),]
#write the alpha diversity measures into a new file
#write.table(div_merge2, file="data/alpha_diversity.txt", sep='\t', quote=F)
Now lets do some stats to compare pairwise between pre and post samples
# pairwise t-tests
# Select columns to test
cols_to_test <- c(2,3,5, 7:14, 16:17)
columns_to_test <- names(div_merge2)[cols_to_test]
tp_pre <- grep("pre$", unique(div_merge2$timepoint), value = TRUE)
tp_post <- grep("post$", unique(div_merge2$timepoint), value = TRUE)
results <- data.frame()
for (col in columns_to_test) {
pre_df <- div_merge2[div_merge2$timepoint == tp_pre, c("participant", col)]
post_df <- div_merge2[div_merge2$timepoint == tp_post, c("participant", col)]
merged <- merge(pre_df, post_df, by = "participant", suffixes = c("_pre", "_post"))
pre_vals <- merged[[paste0(col, "_pre")]]
post_vals <- merged[[paste0(col, "_post")]]
# paired Wilcoxon signed-rank test
test_result <- wilcox.test(pre_vals, post_vals, paired = TRUE, exact = FALSE)
# save results
results <- rbind(
results,
data.frame(
Variable = col,
W_statistic = test_result$statistic,
p_value = test_result$p.value
)
)
}
kable(results)
| Variable | W_statistic | p_value | |
|---|---|---|---|
| V | Observed | 36 | 0.4148231 |
| V1 | Chao1 | 34 | 0.5408179 |
| V2 | ACE | 36 | 0.4148231 |
| V3 | Shannon | 35 | 0.4755327 |
| V4 | Simpson | 33 | 0.6102987 |
| V5 | InvSimpson | 32 | 0.6834809 |
| V6 | Fisher | 37 | 0.3589514 |
| V7 | dbp | 23 | 0.6834809 |
| V8 | dmn | 25 | 0.8384638 |
| V9 | absolute | 27 | 1.0000000 |
| V10 | relative | 23 | 0.6834809 |
| V11 | core_abundance | 21 | 0.5408179 |
| V12 | gini | 15 | 0.2212718 |
So there are no significant trends in alpha diversity pre- vs. post-intervention.
What about if we take into account the metabolic responder status? We can do this with a 2-way ANOVA.
# perform an Aligned Rank Transform (nonparametric alternative to 2-way ANOVA)
# Create the 'participant_id' column to group by
div_merge2$participant_id <- sub("_.*", "", div_merge2$Row.names)
div_merge2$participant_id<-as.factor(div_merge2$participant_id)
div_merge2$timepoint <- as.factor(div_merge2$timepoint)
div_merge2$metabolic_responder <- as.factor(div_merge2$metabolic_responder)
art_aov_results <- data.frame()
for (col in columns_to_test) {
f <- as.formula(paste0(col, " ~ timepoint * metabolic_responder + Error(participant_id)"))
art_model <- art(f, data = div_merge2)
anova_res <- anova(art_model)
anova_df <- tidy(anova_res)
anova_df <- anova_df %>% filter(Term == "timepoint:metabolic_responder")
anova_df$Metric <- col
anova_df$term <- NULL
anova_df <- anova_df %>% relocate(Metric)
art_aov_results <- rbind(art_aov_results, anova_df)
}
## Warning: The column names Term, Error, Df.res, and Sum.Sq.res in ANOVA output were not
## recognized or transformed.
## The column names Term, Error, Df.res, and Sum.Sq.res in ANOVA output were not
## recognized or transformed.
## The column names Term, Error, Df.res, and Sum.Sq.res in ANOVA output were not
## recognized or transformed.
## The column names Term, Error, Df.res, and Sum.Sq.res in ANOVA output were not
## recognized or transformed.
## The column names Term, Error, Df.res, and Sum.Sq.res in ANOVA output were not
## recognized or transformed.
## The column names Term, Error, Df.res, and Sum.Sq.res in ANOVA output were not
## recognized or transformed.
## The column names Term, Error, Df.res, and Sum.Sq.res in ANOVA output were not
## recognized or transformed.
## The column names Term, Error, Df.res, and Sum.Sq.res in ANOVA output were not
## recognized or transformed.
## The column names Term, Error, Df.res, and Sum.Sq.res in ANOVA output were not
## recognized or transformed.
## The column names Term, Error, Df.res, and Sum.Sq.res in ANOVA output were not
## recognized or transformed.
## The column names Term, Error, Df.res, and Sum.Sq.res in ANOVA output were not
## recognized or transformed.
## The column names Term, Error, Df.res, and Sum.Sq.res in ANOVA output were not
## recognized or transformed.
## The column names Term, Error, Df.res, and Sum.Sq.res in ANOVA output were not
## recognized or transformed.
kable(art_aov_results)
| Metric | Term | Error | df | Df.res | sumsq | Sum.Sq.res | meansq | statistic | p.value |
|---|---|---|---|---|---|---|---|---|---|
| Observed | timepoint:metabolic_responder | Within | 1 | 8 | 145.8 | 206.65 | 145.8 | 5.6443262 | 0.0448401 |
| Chao1 | timepoint:metabolic_responder | Within | 1 | 8 | 145.8 | 188.40 | 145.8 | 6.1910828 | 0.0376283 |
| ACE | timepoint:metabolic_responder | Within | 1 | 8 | 145.8 | 199.00 | 145.8 | 5.8613065 | 0.0417840 |
| Shannon | timepoint:metabolic_responder | Within | 1 | 8 | 7.2 | 99.60 | 7.2 | 0.5783133 | 0.4687921 |
| Simpson | timepoint:metabolic_responder | Within | 1 | 8 | 3.2 | 134.80 | 3.2 | 0.1899110 | 0.6745105 |
| InvSimpson | timepoint:metabolic_responder | Within | 1 | 8 | 1.8 | 131.40 | 1.8 | 0.1095890 | 0.7491169 |
| Fisher | timepoint:metabolic_responder | Within | 1 | 8 | 156.8 | 159.00 | 156.8 | 7.8893082 | 0.0228856 |
| dbp | timepoint:metabolic_responder | Within | 1 | 8 | 0.2 | 227.00 | 0.2 | 0.0070485 | 0.9351550 |
| dmn | timepoint:metabolic_responder | Within | 1 | 8 | 1.8 | 152.00 | 1.8 | 0.0947368 | 0.7661004 |
| absolute | timepoint:metabolic_responder | Within | 1 | 8 | 7.2 | 195.60 | 7.2 | 0.2944785 | 0.6021506 |
| relative | timepoint:metabolic_responder | Within | 1 | 8 | 0.2 | 227.00 | 0.2 | 0.0070485 | 0.9351550 |
| core_abundance | timepoint:metabolic_responder | Within | 1 | 8 | 33.8 | 147.00 | 33.8 | 1.8394558 | 0.2120499 |
| gini | timepoint:metabolic_responder | Within | 1 | 8 | 33.8 | 87.00 | 33.8 | 3.1080460 | 0.1159221 |
Several metrics have P < 0.05, so there may be a responder-specific trend in alpha diversity changes to the intervention.
fisher<-ggplot(div_merge2, aes(x=timepoint, y=Fisher, group = metabolic_responder, colour = metabolic_responder)) +
geom_point(aes(fill=metabolic_responder), size=4, shape=21, stroke=1, alpha=0.5) +
geom_line(aes(group = participant), size = 1) +
geom_text_repel(aes(label=participant), size=3) +
scale_color_brewer(palette="Paired") +
scale_fill_brewer(palette="Paired") +
labs(y="Fisher's alpha ") +
scale_x_discrete(labels = c("Pre", "Post")) +
theme_bw() +
theme(axis.title.x = element_blank(), axis.title=element_text(size=10))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
fisher
# what are the dimensions of the original counts file?
dim(counts)
## [1] 23 2418
# Combine the counts and tax tables into one tidier working file
t.counts<-t(counts)
ct<-merge(t.counts, tax, by="row.names")
rownames(ct)<-ct$Row.names
ct$Row.names <- NULL
ct$Sequence <- NULL
ct<-unite(ct,"tax.vector", Kingdom:Species, sep=':', remove=TRUE)
dim(ct)
## [1] 2418 24
Check to see how many samples contain > 1000 total reads.
ct2<-ct[,1:ncol(ct)-1]
i <- (colSums(ct2) <=1000)
d.s <- ct2[, !i]
dim(d.s)
## [1] 2418 23
ncol(ct2)-ncol(d.s)
## [1] 0
So all the samples have > 1000 reads. Now lets apply some frequency-based cutoffs.
d.freq <- apply(d.s, 2, function(x) {x/sum(x)})
#keep SVs > 0.1% in any sample
d.0 <- d.s[apply(d.freq, 1, max)>0.001,]
dim(d.0)
## [1] 767 23
You can save tables with these filtering cutoffs.
# make relative abundance table
d.freq0 <- data.frame(apply(d.0, 2, function(x){x/sum(x)}))
#add taxonomy back in and save the filtered counts file
d.0$tax.vector = ct$tax.vector[match(rownames(d.0), rownames(ct))]
#write.table(d.0, file="data/dada2_tax_counts_001filt.txt", sep='\t', quote=F)
#add taxonomy back in and save the filtered abundance file
d.freq0$tax.vector = ct$tax.vector[match(rownames(d.freq0), rownames(ct))]
#write.table(d.freq0, file="data/dada2_tax_freq_001filt.txt", sep='\t', quote=F)
I also want to remove very rare/sparse SVs.
#filter SVs based on a read count cutoff
count = 100
d.2 <- data.frame(d.s[which(apply(d.s, 1, function(x){sum(x)}) > count),])
dim(d.2)
## [1] 516 23
Lets combine d.freq0 and d.2 so that we have a filtered table that contains only SVs present at 0.1% abundance AND >100 reads across all samples.
row_list<-intersect(rownames(d.0), rownames(d.2))
length(row_list)
## [1] 511
d_filt<-d.s[rownames(d.s) %in% row_list,]
#add taxonomy back in and save the filtered counts file
d_filt$tax.vector = ct$tax.vector[match(rownames(d_filt), rownames(ct))]
#write the file
#write.table(d_filt, file="data/counts_01abun_100filt_2.txt", sep='\t', quote=F)
d_filt_freq<-d.s[rownames(d.s) %in% row_list,]
d_filt_freq<-data.frame(apply(d_filt_freq, 2, function(x){x/sum(x)}))
d_filt_freq$tax.vector = ct$tax.vector[match(rownames(d_filt_freq), rownames(ct))]
#write.table(d_filt_freq, file="data/abundance_01abun_100filt.txt", sep='\t', quote=F)
These files will be the ones we primarily use downstream.
# Read input data
#d <- read.table("data/counts_01abun_100filt.txt", sep="\t", quote="", check.names=F, header=T, row.names=1, comment.char="")
#OR, rename the table made in the previous section to 'd'
d<-d_filt
# Extract and clean the taxonomic information
taxa <- data.frame(str_split_fixed(d$tax.vector, ":", 7))
taxa$Genus <- paste(taxa$X2, taxa$X3, taxa$X4, taxa$X5, taxa$X6, sep="_")
# Merge cleaned taxa info with count data
d1 <- cbind(d[,1:(ncol(d)-1)], Genus=taxa$Genus)
# Aggregate to the genus level
gen <- ddply(d1, "Genus", numcolwise(sum))
row.names(gen) <- gen$Genus
gen$Genus <- NULL
# Calculate the relative abundance
gen.f <- apply(gen, 2, function(x) x / sum(x))
# Order by abundance
y1 <- gen.f[order(rowSums(gen.f), decreasing = TRUE),]
We dont want to plot all the very rare taxa (below 1% abundance). We will put those in a separate group called the remainder.
# Filter taxa above 1% abundance
abund <- 0.01
keep.taxa.index <- rownames(y1[rowMeans(y1) > abund,])
# Retain only the relevant taxa and calculate the remainder
y3 <- as.data.frame(y1) %>% filter(rownames(.) %in% keep.taxa.index)
remainder <- colSums(y1[!rownames(y1) %in% keep.taxa.index, , drop = FALSE])
y3 <- rbind(y3, remainder)
rownames(y3)[nrow(y3)] <- "remainder"
# Remove the controls
y3<-as.data.frame(y3)
y3 <- dplyr::select(y3, -"DNA_BLANK2", -"PCR_BLANK2", -"POS")
# Convert to matrix and reshape for plotting
y3 <- as.matrix(y3)
melted <- melt(y3)
colnames(melted) <- c("Genus", "Sample", "value")
# Shorten genus names
gen_name <- data.frame(str_split_fixed(melted$Genus, "_", 5))
melted$Genus <- ifelse(gen_name$X5 == "", "Genera <1% abundance", gen_name$X5)
# Custom sample ordering
custom_order <- function(samples) {
df <- data.frame(
Sample = samples,
Participant = gsub("_(pre|post)", "", samples),
Visit = ifelse(grepl("_pre", samples), "A", "B")
) %>% arrange(Participant, Visit)
return(df$Sample)
}
melted$Sample <- factor(melted$Sample, levels = custom_order(unique(melted$Sample)))
Make the plot
# Define color palettes
pal1 <- rep(c("lightgray", "#006F6B", "#00ADAB", "#ACDEE0", "#3E783A", "#76BB47", "#AAD486",
"#CBC02D", "#FCF281", "#C36928", "#F58123", "#F8A96E", "#F9DDCA", "#851719",
"#B01F24", "#ED2027", "#F3786D", "#4A2970", "#7E6BAD", "#B8ACD1", "#A71D47",
"#ED1F6B"))
pal2 <- rlist::list.reverse(pal1)
# Plot with faceting
melted$Sample <- gsub("X", "", melted$Sample)
melted$Participant <- sub("_.*", "", melted$Sample)
melted$Timepoint <- sub(".*_", "", melted$Sample)
melted$Timepoint <- gsub("pre", "Pre", melted$Timepoint)
melted$Timepoint <- gsub("post", "Post", melted$Timepoint)
melted$Timepoint <- factor(melted$Timepoint, levels = c("Pre", "Post"))
bars <- ggplot(melted, aes(x=Timepoint, y=value, fill=fct_reorder(Genus, value))) +
geom_bar(stat="identity", position="stack") +
scale_fill_manual(values=pal2) +
facet_grid(~Participant, scales="free_x") +
ylab("Relative Abundance") +
guides(fill = guide_legend(ncol=2, reverse=T, title="Genus")) +
theme_bw() +
theme(axis.text.x=element_text(size=8), legend.position="right", axis.title.x=element_blank(), legend.text=element_text(size=8, face="italic"), legend.title=element_text(size=10), axis.title=element_text(size=10))
bars
tax1 <- d$tax.vector
# Define the columns to remove (controls)
cols_to_remove <- c("DNA_BLANK2", "PCR_BLANK2", "POS")
# Remove the columns if they exist in the data frame
d <- d[, !(colnames(d) %in% cols_to_remove)]
# Split to the 6th taxonomic level -> genus (separated by :)
split6 <- sapply(strsplit(as.character(tax1), ":"), "[", 6)
split6 <- as.data.frame(split6)
rownames(split6)<-rownames(d)
split6$sv<-rownames(d)
split6$sv_split6<-paste(split6$sv, split6$split6, sep='_')
rownames(split6)<-rownames(d)
#get only the count columns
dm <- d[,1:ncol(d)-1]
gen.f <- apply(dm, 2, function(x) {x/sum(x)})
colSums(gen.f) #check all sum to 1
## 001_post 001_pre 002_post 002_pre 005_post 005_pre 006_post 006_pre
## 1 1 1 1 1 1 1 1
## 007_post 007_pre 008_post 008_pre 009_post 009_pre 010_post 010_pre
## 1 1 1 1 1 1 1 1
## 011_post 011_pre 012_post 012_pre
## 1 1 1 1
Apply compositional data transformation (CZM method) and CLR transformation.
d.czm <- cmultRepl(t(gen.f), label=0, method="CZM")
## No. adjusted imputations: 2950
d.clr <- t(apply(d.czm, 1, function(x){log(x) - mean(log(x))}))
Compute Aitchison distances, which will be used in Fig 2C and D.
aitch_dists <- as.matrix(dist(d.clr))
#write.table(aitch_dists, file="data/aitchdist_filtered.txt", sep="\t", quote=F)
Perform PCA
d.pcx <- prcomp(d.clr)
# Define parameters for PCA
sv_positions <- data.frame(d.pcx[["rotation"]])
# Merge on genus table
sv_pos <- merge(sv_positions, split6, by = 0)
# Calculate Euclidean distance
arrow_len <- function(x, y) {
sqrt((x - 0)^2 + (y - 0)^2)
}
sv_pos_dist <- mapply(arrow_len, sv_pos$PC1, sv_pos$PC2)
sv_pos_dist <- as.data.frame(cbind(sv_pos_dist, sv_pos$Row.names, sv_pos$sv_split6, sv_pos$PC1, sv_pos$PC2))
colnames(sv_pos_dist) <- c("Distance", "SV", "Genus", "PC1", "PC2")
# Filter arrow based on distance
filter <- sv_pos_dist %>% filter(Distance >= 0.15)
d.mvar <- sum(d.pcx$sdev^2)
# Calculate the PC1 and PC2 variance
PC1 <- paste("PC1: ", round(sum(d.pcx$sdev[1]^2)/d.mvar, 3))
PC2 <- paste("PC2: ", round(sum(d.pcx$sdev[2]^2)/d.mvar, 3))
metadata<-tibble::rownames_to_column(meta, "SampleID")
loadings<- data.frame(Variables=rownames(d.pcx$rotation), d.pcx$rotation)
values<-merge(d.pcx$x[,c(1,2)], metadata[,c("SampleID","participant","timepoint","metabolic_responder")],
by.x="row.names", by.y="SampleID", all=F)
# Remove unwanted rows
values2 <- values[!(values$Row.names %in% c("003_post", "004_pre","DNA_BLANK2","PCR_BLANK2")), ]
values2$time<-gsub("._","",values2$timepoint)
# Convert participant to a factor so it's treated as discrete
values2$participant <- as.factor(values2$participant)
Make the plot!
p <- ggplot(values2, aes(x = PC1, y = PC2)) +
geom_segment(data = sv_pos, aes(x = 0, y = 0, xend = 50 * PC1, yend = 50 * PC2),
arrow = arrow(length = unit(1/2, 'picas')),
color = "grey69", alpha = 0.8, size = 0.15) + # Plot features
geom_text(data = filter, aes(x = 50 * as.numeric(PC1), y = 50 * as.numeric(PC2)),
nudge_x = 0.25, nudge_y = 0.25, label = filter$Genus,
color = "grey69", size = 3, fontface = "italic") +
geom_point(aes(colour = participant), size = 6) + # Points colored by participant (Viridis)
geom_text(aes(label = time), size = 2.5) +
scale_color_brewer(palette="Paired") +
xlab(paste0("PC1: ", round(100 * (d.pcx$sdev[1]^2 / sum(d.pcx$sdev^2)), 1), "%")) +
ylab(paste0("PC2: ", round(100 * (d.pcx$sdev[2]^2 / sum(d.pcx$sdev^2)), 1), "%")) +
labs(colour="Participant") +
theme_bw() +
theme(legend.position = c(0.1, 0.67), legend.background = element_blank(), legend.key = element_blank(), axis.title=element_text(size=10), legend.title = element_text(size=10))
p
#dm<-read.table("data/aitchdist_filtered.txt", sep='\t', header=TRUE, row.names=1, quote="")
#OR
dm<-aitch_dists
#remove the X from the colnames, if they're there
#colnames(dm)<-gsub("X","", colnames(dm))
intra_values <- list()
# Loop through the dm matrix to capture the filtered values
for(i in 1:nrow(dm)) {
for(j in 1:ncol(dm)) {
currentrow <- unlist(strsplit(rownames(dm)[i], "_"))
currentcol <- unlist(strsplit(colnames(dm)[j], "_"))
# Apply the condition: same identifier (currentrow[1] == currentcol[1]), pre in row, not equal in the second part
if(currentrow[1] == currentcol[1]
&& currentrow[2] != currentcol[2]
&& currentrow[2] == "pre") {
# Store the value into the list
intra_values[[currentrow[1]]] <- c(intra_values[[currentrow[1]]], dm[i, j])
}
}
}
# Convert the list into a data frame and set appropriate column names
intra_df <- as.data.frame(intra_values)
time_test<-t(intra_df)
colnames(time_test)<-"aitch"
rownames(time_test)<-gsub("X","", rownames(time_test))
rownames(time_test)<-paste0(rownames(time_test), "_post")
dist_merge<-merge(meta, time_test, by=0, all=FALSE)
dist_merge$participant<-as.factor(dist_merge$participant)
aitch_plot <- ggplot(dist_merge, aes(x = participant, y = aitch, fill = factor(participant))) +
geom_bar(stat = "identity", position = "dodge") +
scale_fill_brewer(palette = "Paired") +
labs(y = "Pre vs. Post Aitchison Distance", x = "Participant") +
theme_bw() +
theme(legend.position = "none", axis.title = element_text(size = 10), axis.ticks.x = element_blank(),
axis.text.x = element_blank()) + # Remove x-axis labels
geom_text(aes(x = participant, y = 2, label = participant),
color = "white", size = 3, inherit.aes = FALSE) # Adjust size as needed
aitch_plot
interindiv_pre_values <- list()
# Loop through the dm matrix to capture the filtered values
for(i in 1:nrow(dm)) {
for(j in 1:ncol(dm)) {
currentrow <- unlist(strsplit(rownames(dm)[i], "_"))
currentcol <- unlist(strsplit(colnames(dm)[j], "_"))
# Apply the condition: same identifier (currentrow[1] == currentcol[1]), pre in row, not equal in the second part
if(currentrow[1] != currentcol[1]
&& currentrow[2] == currentcol[2]
&& currentrow[2] == "pre") {
# Store the value into the list
interindiv_pre_values[[currentrow[1]]] <- c(interindiv_pre_values[[currentrow[1]]], dm[i, j])
}
}
}
interindiv_post_values <- list()
# Loop through the dm matrix to capture the filtered values
for(i in 1:nrow(dm)) {
for(j in 1:ncol(dm)) {
currentrow <- unlist(strsplit(rownames(dm)[i], "_"))
currentcol <- unlist(strsplit(colnames(dm)[j], "_"))
# Apply the condition: same identifier (currentrow[1] == currentcol[1]), post in row, not equal in the second part
if(currentrow[1] != currentcol[1]
&& currentrow[2] == currentcol[2]
&& currentrow[2] == "post") {
# Store the value into the list
interindiv_post_values[[currentrow[1]]] <- c(interindiv_post_values[[currentrow[1]]], dm[i, j])
}
}
}
# Convert lists to data frames
intra_df <- data.frame(Distance = unlist(intra_values), Category = "Intra")
inter_pre_df <- data.frame(Distance = unlist(interindiv_pre_values), Category = "Inter-Pre")
inter_post_df <- data.frame(Distance = unlist(interindiv_post_values), Category = "Inter-Post")
distance_data <- bind_rows(intra_df, inter_pre_df, inter_post_df) # Combine data frames
distance_data$Category <- factor(distance_data$Category, levels = c("Intra", "Inter-Pre", "Inter-Post"))
dist_comp<-ggplot(distance_data, aes(x = Category, y = Distance)) +
geom_violin(trim = FALSE, fill="lightgrey", alpha = 0.5) +
geom_boxplot(width = 0.2, outlier.shape = NA, color = "black") + # Add boxplot inside violin
theme_bw() +
#scale_fill_manual(values = c("Intra" = "#1f77b4", "Inter-Pre" = "#ff7f0e", "Inter-Post" = "#2ca02c")) + # Custom colors
labs(y = "Aitchison Distance") +
theme(legend.position = "none", text = element_text(size = 10), axis.title.x = element_blank())
dist_comp
#div<-read.table("data/alpha_diversity.txt", sep="\t", quote="", check.names=F, header=T, row.names=1, comment.char="")
#OR
div<-div_merge2
div$participant<-as.factor(div$participant)
div<-div[!((div$Row.names) %in% c("003_post", "004_pre")),]
shan<-ggplot(div, aes(x=timepoint, y=Shannon, group = participant, colour = participant)) +
geom_point(aes(fill=participant), size=4, shape=21, stroke=1, alpha=0.5) +
geom_line(aes(group = participant), size = 1) +
scale_color_brewer(palette="Paired") +
scale_fill_brewer(palette="Paired") +
labs(y="Shannon's Index") +
scale_x_discrete(labels = c("Pre", "Post")) +
theme_bw() +
theme(axis.title.x = element_blank(), legend.position ="none", axis.title=element_text(size=10))
shan
core<-ggplot(div, aes(x=timepoint, y=core_abundance, group = participant, colour = participant)) +
geom_point(aes(fill=participant), size=4, shape=21, stroke=1, alpha=0.5) +
geom_line(aes(group = participant), size = 1) +
scale_color_brewer(palette="Paired") +
scale_fill_brewer(palette="Paired") +
labs(y="Core Abundance") +
scale_x_discrete(labels = c("Pre", "Post")) +
theme_bw() +
theme(axis.title.x = element_blank(), legend.position="none", axis.title=element_text(size=10))
core
lay<-rbind(c(1,1,1,1),c(1,1,1,1),c(2,2,3,4),c(2,2,5,6))
grid.arrange(bars, p, aitch_plot, dist_comp, shan, core, layout_matrix=lay)
Specify the comparison groups of samples:
pre<-c("001_pre","002_pre","005_pre","006_pre","007_pre","008_pre","009_pre","010_pre","011_pre","012_pre")
post<-c("001_post","002_post","005_post","006_post","007_post","008_post","009_post","010_post","011_post","012_post")
Perform an aldex2 t-test and effect size test of paired pre- vs post-intervention samples.
aldex.in<-d[,c(pre, post)]
conds<-c(rep("pre", length(pre)), rep("post", length(post)))
x <- aldex.clr(aldex.in, conds, mc.samples=128, verbose=TRUE)
x.tt <- aldex.ttest(x, paired.test=TRUE)
x.effect <- aldex.effect(x)
#merge the data
x.all <- data.frame(x.tt, x.effect)
tax<-as.data.frame(tax)
#merge taxonomic information
x.all.tax.sv <- merge(x.all, dplyr::select(tax, 2:7), by = "row.names", all.y = FALSE)
#write a .txt file with the results
#write.table(x.all.tax, file="aldex_output/pre_vspost_SV.txt", sep="\t", quote=F, col.names=NA)
Make the heatmap
#aldex_out<-read.table("aldex_output/pre_vspost_SV.txt", sep="\t", quote="", header=T, row.names=1)
#OR
aldex_out<-x.all.tax.sv
filt_effect <- aldex_out[abs(aldex_out$effect) > 0.5, ]
# relative abundance
abun <- apply(aldex.in, 2, function(x) {x/sum(x)})
colSums(abun) #check all sum to 1
## 001_pre 002_pre 005_pre 006_pre 007_pre 008_pre 009_pre 010_pre
## 1 1 1 1 1 1 1 1
## 011_pre 012_pre 001_post 002_post 005_post 006_post 007_post 008_post
## 1 1 1 1 1 1 1 1
## 009_post 010_post 011_post 012_post
## 1 1 1 1
#transpose to samples as rows
abun.t<-t(abun)
d.czm <- cmultRepl(abun.t, label=0, method="CZM", z.warning = 0.95)
## No. adjusted imputations: 6601
# The table needs to be transposed again (samples as COLUMNS)
d.clr <- t(apply(d.czm, 1, function(x){log(x) - mean(log(x))}))
#only get the differentially abundant SV
d.clr.diff<-d.clr[,colnames(d.clr) %in% filt_effect$Row.names]
clr_melt<-melt(d.clr.diff)
colnames(clr_melt)<-c("sample","SV","clr")
#add a participant and timepoint column
clr_melt$participant<-gsub("_.*","", clr_melt$sample)
clr_melt$timepoint<-gsub(".*_","",clr_melt$sample)
clr_melt$timepoint<-gsub("pre","1pre",clr_melt$timepoint)
filt_effect <- filt_effect %>%
mutate(bugnames = ifelse(!is.na(Species), paste(Row.names, Genus, Species, sep = "_"), paste(Row.names, Genus, sep = "_")))
filt_effect<- filt_effect %>% arrange(effect)
clr_final <- clr_melt %>%
left_join(filt_effect %>% dplyr::select(Row.names, bugnames), by = c("SV" = "Row.names"))
clr_final <- clr_final %>%
left_join(filt_effect %>% dplyr::select(Row.names, effect), by =c("SV"="Row.names"))
clr_final$effect<-clr_final$effect * -1
clr_final <- clr_final %>%
arrange(effect) %>%
mutate(bugnames = factor(bugnames, levels = unique(bugnames))) # Preserve order in factor
filt_effect$effect<-filt_effect$effect *-1
filt_effect<-filt_effect %>% mutate(bugnames = factor(bugnames, levels = unique(bugnames)))
Make the plot
heat <- ggplot(clr_final, aes(x = timepoint, y = bugnames, fill = clr)) +
geom_tile() +
facet_grid(~participant, scales = "free_x") +
scale_fill_gradientn(
colors = rev(c("#98033A","#D43747","#F76340","#FEA55C","#FEFFBA","white","#9CDA9C","#267AB1","#5a78b5","#554394")),
values = c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1), # Keeping the same values
limits = range(clr_final$clr)) +
scale_x_discrete(labels=c("1pre"="Pre", "post"="Post")) +
theme_bw() +
theme(axis.title = element_blank(),
legend.position = c(-0.2, 1.05),
legend.direction = "horizontal",
axis.text.y=element_text(face="italic"),
axis.text.x = element_text(size=8))
heat
This heatmap will look slightly different every time a new aldex ttest and effect size test is run, because the results are based on estimates.
bars <- ggplot(filt_effect, aes(x = effect, y = bugnames, fill = ifelse(effect > 0.5, "salmon", "#5a78b5"))) +
geom_col() +
scale_fill_identity() + # Directly apply colors without expecting discrete categories
geom_text(aes(label = round(effect, 2), x = ifelse(effect > 0.5, 0.1, -0.1)), # Position text dynamically
color = "black", hjust = ifelse(filt_effect$effect > 0.5, 0, 1), size = 3) + # Adjust text alignment
scale_y_discrete(limits = rev(unique(filt_effect$bugnames))) + # Keep order consistent with heatmap
theme_minimal() +
xlab("Effect Size") +
theme(
axis.text.y = element_blank(), # Hide y-axis text to avoid duplication
axis.title.y = element_blank(),
axis.text.x = element_blank(),
panel.grid = element_blank())
bars
final_plot<- heat + bars + plot_layout(widths = c(4, 1))
final_plot
Perform an aldex2 t-test and effect size test of paired pre- vs post-intervention samples on microbial functional output tables.
First up, EC numbers.
ec<-read.table("data/pred_metagenome_unstrat_EC.tsv", sep="\t", quote="", check.names=F, header=T, row.names=1, comment.char="")
ec<-round(ec, digits=0)
# pre vs post all samples
aldex.in.ec<-ec[,c(pre, post)]
conds<-c(rep("pre", length(pre)), rep("post", length(post)))
x.ec <- aldex.clr(aldex.in.ec, conds, mc.samples=128, verbose=TRUE)
x.tt.ec <- aldex.ttest(x.ec, paired.test=TRUE)
x.effect.ec <- aldex.effect(x.ec)
#merge the data
x.all.ec <- data.frame(x.tt.ec, x.effect.ec)
#write.table(x.all.ec, file="aldex_output/prevspost_ECnumbers.txt", sep='\t', quote=F)
Next up, KOs.
ko<-read.table("data/pred_metagenome_unstrat_KO.tsv", sep="\t", quote="", check.names=F, header=T, row.names=1, comment.char="")
ko<-round(ko, digits=0)
# pre vs post all samples
aldex.in.ko<-ko[,c(pre, post)]
conds<-c(rep("pre", length(pre)), rep("post", length(post)))
x.ko <- aldex.clr(aldex.in.ko, conds, mc.samples=128, verbose=TRUE)
x.tt.ko <- aldex.ttest(x.ko, paired.test=TRUE)
x.effect.ko <- aldex.effect(x.ko)
#merge the data
x.all.ko <- data.frame(x.tt.ko, x.effect.ko)
#write.table(x.all.ko, file="aldex_output/prevspost_KOnumbers.txt", sep='\t', quote=F)
And finally, at the functional pathway level.
p<-read.table("data/path_abun_unstrat.tsv", sep="\t", quote="", check.names=F, header=T, row.names=1, comment.char="")
p<-round(p, digits=0)
# pre vs post all samples
aldex.in.p<-p[,c(pre, post)]
conds<-c(rep("pre", length(pre)), rep("post", length(post)))
x.p <- aldex.clr(aldex.in.p, conds, mc.samples=128, verbose=TRUE)
x.tt.p <- aldex.ttest(x.p, paired.test=TRUE)
x.effect.p <- aldex.effect(x.p)
#merge the data
x.all.p <- data.frame(x.tt.p, x.effect.p)
#write.table(x.all.p, file="aldex_output/prevspost_pathways.txt", sep='\t', quote=F)
Now lets make some volcano plots.
x.all.ec$effect<-x.all.ec$effect * -1
ec_num<-x.all.ec
#colours based on significance!
# add a column
ec_num$diffexpressed <- "NO"
# if log2Foldchange > 0.5 and pvalue < 0.05, set as "UP"
ec_num$diffexpressed[ec_num$effect > 0.5] <- "UP"
# if log2Foldchange < -0.6 and pvalue < 0.05, set as "DOWN"
ec_num$diffexpressed[ec_num$effect < -0.5] <- "DOWN"
#labels based on significance!
ec_num$delabel <- NA
#ec_num$delabel[ec_num$diffexpressed != "NO"] <- rownames(ec_num)[ec_num$diffexpressed != "NO"]
ec_num$delabel[ec_num$effect > 0.65 | ec_num$effect < -0.6] <- rownames(ec_num)[ec_num$effect > 0.65 | ec_num$effect < -0.65]
ec_volcano <- ggplot(data=ec_num, aes(x=effect, y=-log10(wi.ep), col=diffexpressed)) +
geom_point(aes(alpha = ifelse(diffexpressed == "NO", 0.5, 1))) +
theme_bw() +
scale_color_manual(values=c("#5a78b5", "black", "salmon")) +
theme(legend.position="none") +
labs(x="Effect Size", y="Wilcoxon P (-log10)") +
geom_text_repel(aes(label = delabel), size = 3, max.overlaps = 22)
ec_volcano
## Warning: Removed 1646 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
x.all.ko$effect<-x.all.ko$effect * -1
ko_num<-x.all.ko
#colours based on significance!
# add a column
ko_num$diffexpressed <- "NO"
# if log2Foldchange > 0.5 and pvalue < 0.05, set as "UP"
ko_num$diffexpressed[ko_num$effect > 0.5] <- "UP"
# if log2Foldchange < -0.6 and pvalue < 0.05, set as "DOWN"
ko_num$diffexpressed[ko_num$effect < -0.5] <- "DOWN"
#labels based on significance!
ko_num$delabel <- NA
#ko_num$delabel[ko_num$diffexpressed != "NO"] <- rownames(ko_num)[ko_num$diffexpressed != "NO"]
ko_num$delabel[ko_num$effect > 0.65 | ko_num$effect < -0.6] <- rownames(ko_num)[ko_num$effect > 0.65 | ko_num$effect < -0.65]
## Warning in ko_num$delabel[ko_num$effect > 0.65 | ko_num$effect < -0.6] <-
## rownames(ko_num)[ko_num$effect > : number of items to replace is not a multiple
## of replacement length
ko_volcano <- ggplot(data=ko_num, aes(x=effect, y=-log10(wi.ep), col=diffexpressed)) +
geom_point(aes(alpha = ifelse(diffexpressed == "NO", 0.5, 1))) +
theme_bw() +
scale_color_manual(values=c("#5a78b5", "black", "salmon")) +
theme(legend.position="none") +
labs(x="Effect Size", y="Wilcoxon P (-log10)") +
geom_text_repel(aes(label = delabel), size = 3, max.overlaps = 22)
ko_volcano
## Warning: Removed 5271 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
# relative abundance
p.f <- apply(p, 2, function(x) {x/sum(x)})
colSums(p.f) #check all sum to 1
## 001_post 001_pre 002_post 002_pre 003_post 004_pre 005_post
## 1 1 1 1 1 1 1
## 005_pre 006_post 006_pre 007_post 007_pre 008_post 008_pre
## 1 1 1 1 1 1 1
## 009_post 009_pre 010_post 010_pre 011_post 011_pre 012_post
## 1 1 1 1 1 1 1
## 012_pre DNA_BLANK2 PCR_BLANK2
## 1 1 1
to.remove<-c("003_post","004_pre","DNA_BLANK2","PCR_BLANK2")
p.f2<-p.f[,!colnames(p.f) %in% to.remove]
#transpose to samples as rows
p.f3<-t(p.f2)
#samples must be as rows
p.czm <- cmultRepl(p.f3, label=0, method="CZM")
## No. adjusted imputations: 222
p.clr <- t(apply(p.czm, 1, function(x){log(x) - mean(log(x))}))
meta<-read.table("data/metadata.txt", sep="\t", quote="", check.names=F, header=T, row.names=1, comment.char="")
p.clr.m<-merge(p.clr, meta, by=0, all=FALSE)
p.clr.m$participant<-as.factor(p.clr.m$participant)
p1 <- ggplot(p.clr.m, aes(x = timepoint, y = p.clr.m[[163]], group = participant, colour = participant)) +
geom_point(aes(fill = participant), size = 3, shape = 21, stroke = 1, alpha = 0.5) +
geom_line(aes(group = participant), size = 1) +
scale_color_brewer(palette = "Paired") +
scale_fill_brewer(palette = "Paired") +
labs(title = "PWY-5913", y = "CLR relative abundance") +
scale_x_discrete(labels = c("Pre", "Post")) +
theme_bw() +
annotate("text", x = 1.5, y = 1.25, label = "ES = -0.65", size = 3) +
theme(axis.title.x = element_blank(), legend.position = "none", plot.title = element_text(size=10))
p2 <- ggplot(p.clr.m, aes(x = timepoint, y = p.clr.m[[250]], group = participant, colour = participant)) +
geom_point(aes(fill = participant), size = 3, shape = 21, stroke = 1, alpha = 0.5) +
geom_line(aes(group = participant), size = 1) +
scale_color_brewer(palette = "Paired") +
scale_fill_brewer(palette = "Paired") +
labs(title = "PWY-7328", y = "CLR relative abundance") +
scale_x_discrete(labels = c("Pre", "Post")) +
annotate("text", x = 1.5, y = 0.5, label = "ES = -0.61", size = 3) +
theme_bw() +
theme(axis.title.x = element_blank(), legend.position = "none", plot.title = element_text(size=10))
p3 <- ggplot(p.clr.m, aes(x = timepoint, y = p.clr.m[[108]], group = participant, colour = participant)) +
geom_point(aes(fill = participant), size = 3, shape = 21, stroke = 1, alpha = 0.5) +
geom_line(aes(group = participant), size = 1) +
scale_color_brewer(palette = "Paired") +
scale_fill_brewer(palette = "Paired") +
labs(title = "PWY-3781", y = "CLR relative abundance") +
scale_x_discrete(labels = c("Pre", "Post")) +
annotate("text", x = 1.5, y = 0.5, label = "ES = -0.58", size = 3) +
theme_bw() +
theme(axis.title.x = element_blank(), legend.position = "none", plot.title = element_text(size=10))
p4 <- ggplot(p.clr.m, aes(x = timepoint, y = p.clr.m[[80]], group = participant, colour = participant)) +
geom_point(aes(fill = participant), size = 3, shape = 21, stroke = 1, alpha = 0.5) +
geom_line(aes(group = participant), size = 1) +
scale_color_brewer(palette = "Paired") +
scale_fill_brewer(palette = "Paired") +
labs(title = "P125-PWY", y = "CLR relative abundance") +
scale_x_discrete(labels = c("Pre", "Post")) +
annotate("text", x = 1.5, y = 0.5, label = "ES = -0.57", size = 3) +
theme_bw() +
theme(axis.title.x = element_blank(), legend.position = "none", plot.title = element_text(size=10))
p5 <- ggplot(p.clr.m, aes(x = timepoint, y = p.clr.m[[64]], group = participant, colour = participant)) +
geom_point(aes(fill = participant), size = 3, shape = 21, stroke = 1, alpha = 0.5) +
geom_line(aes(group = participant), size = 1) +
scale_color_brewer(palette = "Paired") +
scale_fill_brewer(palette = "Paired") +
labs(title = "LACTOSECAT-PWY", y = "CLR relative abundance") +
scale_x_discrete(labels = c("Pre", "Post")) +
annotate("text", x = 1.5, y = 0.5, label = "ES = -0.55", size = 3) +
theme_bw() +
theme(axis.title.x = element_blank(), legend.position = "none", plot.title = element_text(size=10))
Make the final figure panel including Figure 3 B-H
layout <- "
AAAACDE
BBBBFG#
"
ec_volcano + ko_volcano + p1 + p2 + p3 + p4 + p5 +
plot_layout(design = layout)
Specify the sample comparison groups:
metR_pre<-c("001_pre","005_pre","007_pre","009_pre","011_pre")
metR_post<-c("001_post","005_post","007_post","009_post","011_post")
metNR_pre<-c("002_pre","006_pre","008_pre","010_pre","012_pre")
metNR_post<-c("002_post","006_post","008_post","010_post","012_post")
Perform an aldex2 t-test and effect size test of un paired pre-intervention R vs NR samples.
First, taxa (SV).
aldex.in<-d[,c(metR_pre, metNR_pre)]
conds<-c(rep("metR_pre", length(metR_pre)), rep("metNR_pre", length(metNR_pre)))
x <- aldex.clr(aldex.in, conds, mc.samples=128, verbose=TRUE)
x.tt <- aldex.ttest(x, paired.test=TRUE)
x.effect <- aldex.effect(x)
x.all <- data.frame(x.tt, x.effect) #merge the data
tax<-as.data.frame(tax)
x.all.tax.sv.pre <- merge(x.all, dplyr::select(tax, 2:7), by = "row.names", all.y = FALSE)
#write.table(x.all.tax.sv.pre, file="aldex_output/metR_vsNR_preSV2.txt", sep="\t", quote=F, col.names=NA)
Make the volcano plot based on these comparisons.
pre.sv<-x.all.tax.sv.pre
# add a column for labels based on significance
pre.sv$diffexpressed <- "NO"
# if log2Foldchange > 0.5 and pvalue < 0.05, set as "UP"
pre.sv$diffexpressed[pre.sv$effect > 0.5] <- "UP"
# if log2Foldchange < -0.6 and pvalue < 0.05, set as "DOWN"
pre.sv$diffexpressed[pre.sv$effect < -0.5] <- "DOWN"
pre.sv$delabel <- NA
rownames(pre.sv) <-pre.sv[,1]
pre.sv$delabel[pre.sv$diffexpressed != "NO"] <- rownames(pre.sv)[pre.sv$diffexpressed != "NO"]
pre.sv.v <- ggplot(data=pre.sv, aes(x=effect, y=-log10(wi.ep), col=diffexpressed)) +
geom_point(aes(alpha = ifelse(diffexpressed == "NO", 0.5, 1))) +
theme_bw() +
scale_color_manual(values=c("#5a78b5", "black", "salmon")) +
theme(legend.position="none") +
labs(x="Effect Size", y="Wilcoxon P (-log10)")
aldex.in<-d[,c(metR_post, metNR_post)]
conds<-c(rep("metR_post", length(metR_post)), rep("metNR_post", length(metNR_post)))
x <- aldex.clr(aldex.in, conds, mc.samples=128, verbose=TRUE)
x.tt <- aldex.ttest(x, paired.test=TRUE)
x.effect <- aldex.effect(x)
x.all <- data.frame(x.tt, x.effect) #merge the data
tax<-as.data.frame(tax)
x.all.tax.sv.post <- merge(x.all, dplyr::select(tax, 2:7), by = "row.names", all.y = FALSE)
#write.table(x.all.tax.sv.post, file="metR_vsNR_postSV2.txt", sep="\t", quote=F, col.names=NA)
post.sv<-x.all.tax.sv.post
# add a column for labels based on significance
post.sv$diffexpostssed <- "NO"
# if log2Foldchange > 0.5 and pvalue < 0.05, set as "UP"
post.sv$diffexpostssed[post.sv$effect > 0.5] <- "UP"
# if log2Foldchange < -0.6 and pvalue < 0.05, set as "DOWN"
post.sv$diffexpostssed[post.sv$effect < -0.5] <- "DOWN"
post.sv$delabel <- NA
rownames(post.sv) <-post.sv[,1]
post.sv$delabel[post.sv$diffexpostssed != "NO"] <- rownames(post.sv)[post.sv$diffexpostssed != "NO"]
post.sv.v <- ggplot(data=post.sv, aes(x=effect, y=-log10(wi.ep), col=diffexpostssed)) +
geom_point(aes(alpha = ifelse(diffexpostssed == "NO", 0.5, 1))) +
theme_bw() +
scale_color_manual(values=c("#5a78b5", "black", "salmon")) +
theme(legend.position="none") +
labs(x="Effect Size", y="Wilcoxon P (-log10)")
Repeat with functional EC values.
aldex.in<-ec[,c(metR_pre, metNR_pre)]
conds<-c(rep("metR_pre", length(metR_pre)), rep("metNR_pre", length(metNR_pre)))
x <- aldex.clr(aldex.in, conds, mc.samples=128, verbose=TRUE)
x.tt <- aldex.ttest(x, paired.test=TRUE)
x.effect <- aldex.effect(x)
pre.ec <- data.frame(x.tt, x.effect) #merge the data
#write.table(pre.ec, file="metR_vsNR_preEC2.txt", sep="\t", quote=F, col.names=NA)
# add a column for labels based on significance
pre.ec$diffexpressed <- "NO"
# if log2Foldchange > 0.5 and pvalue < 0.05, set as "UP"
pre.ec$diffexpressed[pre.ec$effect > 0.5] <- "UP"
# if log2Foldchange < -0.6 and pvalue < 0.05, set as "DOWN"
pre.ec$diffexpressed[pre.ec$effect < -0.5] <- "DOWN"
pre.ec$delabel <- NA
pre.ec$delabel[pre.ec$diffexpressed != "NO"] <- rownames(pre.ec)[pre.ec$diffexpressed != "NO"]
pre.ec.v <- ggplot(data=pre.ec, aes(x=effect, y=-log10(wi.ep), col=diffexpressed)) +
geom_point(aes(alpha = ifelse(diffexpressed == "NO", 0.5, 1))) +
theme_bw() +
scale_color_manual(values=c("#5a78b5", "black", "salmon")) +
theme(legend.position="none") +
labs(x="Effect Size", y="Wilcoxon P (-log10)")
aldex.in<-ec[,c(metR_post, metNR_post)]
conds<-c(rep("metR_post", length(metR_post)), rep("metNR_post", length(metNR_post)))
x <- aldex.clr(aldex.in, conds, mc.samples=128, verbose=TRUE)
x.tt <- aldex.ttest(x, paired.test=TRUE)
x.effect <- aldex.effect(x)
post.ec <- data.frame(x.tt, x.effect) #merge the data
#write.table(post.ec, file="metR_vsNR_postEC2.txt", sep="\t", quote=F, col.names=NA)
# add a column for labels based on significance
post.ec$diffexpressed <- "NO"
# if log2Foldchange > 0.5 and pvalue < 0.05, set as "UP"
post.ec$diffexpressed[post.ec$effect > 0.5] <- "UP"
# if log2Foldchange < -0.6 and pvalue < 0.05, set as "DOWN"
post.ec$diffexpressed[post.ec$effect < -0.5] <- "DOWN"
post.ec$delabel <- NA
post.ec$delabel[post.ec$diffexpressed != "NO"] <- rownames(post.ec)[post.ec$diffexpressed != "NO"]
post.ec.v <- ggplot(data=post.ec, aes(x=effect, y=-log10(wi.ep), col=diffexpressed)) +
geom_point(aes(alpha = ifelse(diffexpressed == "NO", 0.5, 1))) +
theme_bw() +
scale_color_manual(values=c("#5a78b5", "black", "salmon")) +
theme(legend.position="none") +
labs(x="Effect Size", y="Wilcoxon P (-log10)")
Repeat with functional pathways.
aldex.in<-p[,c(metR_pre, metNR_pre)]
conds<-c(rep("metR_pre", length(metR_pre)), rep("metNR_pre", length(metNR_pre)))
x <- aldex.clr(aldex.in, conds, mc.samples=128, verbose=TRUE)
x.tt <- aldex.ttest(x, paired.test=TRUE)
x.effect <- aldex.effect(x)
pre.paths <- data.frame(x.tt, x.effect) #merge the data
#write.table(pre.paths, file="metR_vsNR_prePaths2.txt", sep="\t", quote=F, col.names=NA)
# add a column for labels based on significance
pre.paths$diffexpressed <- "NO"
# if log2Foldchange > 0.5 and pvalue < 0.05, set as "UP"
pre.paths$diffexpressed[pre.paths$effect > 0.5] <- "UP"
# if log2Foldchange < -0.6 and pvalue < 0.05, set as "DOWN"
pre.paths$diffexpressed[pre.paths$effect < -0.5] <- "DOWN"
pre.paths$delabel <- NA
pre.paths$delabel[pre.paths$diffexpressed != "NO"] <- rownames(pre.paths)[pre.paths$diffexpressed != "NO"]
pre.paths.v <- ggplot(data=pre.paths, aes(x=effect, y=-log10(wi.ep), col=diffexpressed)) +
geom_point(aes(alpha = ifelse(diffexpressed == "NO", 0.5, 1))) +
theme_bw() +
scale_color_manual(values=c("#5a78b5", "black", "salmon")) +
theme(legend.position="none") +
labs(x="Effect Size", y="Wilcoxon P (-log10)")
aldex.in<-p[,c(metR_post, metNR_post)]
conds<-c(rep("metR_post", length(metR_post)), rep("metNR_post", length(metNR_post)))
x <- aldex.clr(aldex.in, conds, mc.samples=128, verbose=TRUE)
x.tt <- aldex.ttest(x, paired.test=TRUE)
x.effect <- aldex.effect(x)
post.paths <- data.frame(x.tt, x.effect) #merge the data
#write.table(post.paths, file="metR_vsNR_postPaths2.txt", sep="\t", quote=F, col.names=NA)
# add a column for labels based on significance
post.paths$diffexpressed <- "NO"
# if log2Foldchange > 0.5 and pvalue < 0.05, set as "UP"
post.paths$diffexpressed[post.paths$effect > 0.5] <- "UP"
# if log2Foldchange < -0.6 and pvalue < 0.05, set as "DOWN"
post.paths$diffexpressed[post.paths$effect < -0.5] <- "DOWN"
post.paths$delabel <- NA
post.paths$delabel[post.paths$diffexpressed != "NO"] <- rownames(post.paths)[post.paths$diffexpressed != "NO"]
post.paths.v <- ggplot(data=post.paths, aes(x=effect, y=-log10(wi.ep), col=diffexpressed)) +
geom_point(aes(alpha = ifelse(diffexpressed == "NO", 0.5, 1))) +
theme_bw() +
scale_color_manual(values=c("#5a78b5", "black", "salmon")) +
theme(legend.position="none") +
labs(x="Effect Size", y="Wilcoxon P (-log10)")
Make the figure panel
grid.arrange(pre.sv.v, post.sv.v, pre.ec.v, post.ec.v, pre.paths.v, post.paths.v, nrow=3)
Compare the features (ASVs, ECs, pathways) across timepoints.
# Convert row names to a column called Row.names for .ec and .p files
pre.ec$Row.names <- rownames(pre.ec)
post.ec$Row.names <- rownames(post.ec)
pre.paths$Row.names <- rownames(pre.paths)
post.paths$Row.names <- rownames(post.paths)
# Function to create comparison table
generate_comparison_table <- function(pre, post) {
# Extract negative effect features
pre.neg <- pre$Row.names[pre$effect < -0.5]
post.neg <- post$Row.names[post$effect < -0.5]
# Find common and unique features (negative effect)
common.neg <- intersect(pre.neg, post.neg)
unique.pre.neg <- setdiff(pre.neg, post.neg)
unique.post.neg <- setdiff(post.neg, pre.neg)
# Extract positive effect features
pre.pos <- pre$Row.names[pre$effect > 0.5]
post.pos <- post$Row.names[post$effect > 0.5]
# Find common and unique features (positive effect)
common.pos <- intersect(pre.pos, post.pos)
unique.pre.pos <- setdiff(pre.pos, post.pos)
unique.post.pos <- setdiff(post.pos, pre.pos)
# Determine the maximum number of rows needed
max_length <- max(length(common.neg), length(unique.pre.neg), length(unique.post.neg),
length(common.pos), length(unique.pre.pos), length(unique.post.pos))
# Pad each vector with NA to match max_length
data.frame(
Common_Negative = c(common.neg, rep(NA, max_length - length(common.neg))),
Unique_Pre_Neg = c(unique.pre.neg, rep(NA, max_length - length(unique.pre.neg))),
Unique_Post_Neg = c(unique.post.neg, rep(NA, max_length - length(unique.post.neg))),
Common_Positive = c(common.pos, rep(NA, max_length - length(common.pos))),
Unique_Pre_Pos = c(unique.pre.pos, rep(NA, max_length - length(unique.pre.pos))),
Unique_Post_Pos = c(unique.post.pos, rep(NA, max_length - length(unique.post.pos)))
)
}
# Generate comparison tables
comparison_table_sv <- generate_comparison_table(pre.sv, post.sv)
comparison_table_ec <- generate_comparison_table(pre.ec, post.ec)
comparison_table_paths <- generate_comparison_table(pre.paths, post.paths)
# Save results
#write.table(comparison_table_sv, "RvsNRcomparisons_sv.txt", sep="\t", row.names=FALSE, quote=FALSE)
#write.table(comparison_table_ec, "RvsNRcomparisons_ec.txt", sep="\t", row.names=FALSE, quote=FALSE)
#write.table(comparison_table_paths, "RvsNRcomparisons_paths.txt", sep="\t", row.names=FALSE, quote=FALSE)
Calculate the deltaCLR (CLR post - CLR pre)
d.clr <- as.data.frame(d.clr)
d.clr$id <- rownames(d.clr)
# Split into pre and post tables
d.pre <- d.clr[grepl("_pre$", d.clr$id), ]
d.post <- d.clr[grepl("_post$", d.clr$id), ]
# Extract individual IDs
d.pre$participant <- sub("_pre$", "", d.pre$id)
d.post$participant <- sub("_post$", "", d.post$id)
# Make sure features are only the columns with CLR values
clr.cols <- setdiff(colnames(d.clr), "id")
# Merge pre and post by subject
d.delta <- merge(d.post[, c("participant", clr.cols)],
d.pre[, c("participant", clr.cols)],
by = "participant",
suffixes = c("_post", "_pre"))
# Calculate delta CLR (post - pre)
delta.cols <- paste0(clr.cols, "_delta")
d.delta[delta.cols] <- d.delta[paste0(clr.cols, "_post")] - d.delta[paste0(clr.cols, "_pre")]
# Keep only subject and delta CLR columns
d.delta.clr <- d.delta[, c("participant", delta.cols)]
# Remove leading zeros
d.delta.clr$participant <- sub("^0+", "", d.delta.clr$participant)
# merge with metadata
delta_meta <- merge(meta, d.delta.clr, by = "participant", all = FALSE)
deltas <- c(39:542)
delta_cols <- names(delta_meta)[deltas]
delta_meta <- delta_meta[delta_meta$timepoint!= "2_post",]
#perform wilcoxon test of delta CLR values between R vs NR
#Ensure grouping variable is a factor
delta_meta$metabolic_responder <- as.factor(delta_meta$metabolic_responder)
wilcox_results <- lapply(delta_cols, function(col) {
# Skip non-numeric or zero-variance features
if (!is.numeric(delta_meta[[col]]) ||
length(unique(na.omit(delta_meta[[col]]))) < 2) {
return(NULL)
}
# Wilcoxon test
wt <- wilcox.test(
delta_meta[[col]] ~ delta_meta$metabolic_responder,
exact = FALSE,
na.action = na.omit
)
# Group means
mean_NR <- mean(delta_meta[[col]][delta_meta$metabolic_responder == "NR"], na.rm = TRUE)
mean_R <- mean(delta_meta[[col]][delta_meta$metabolic_responder == "R"], na.rm = TRUE)
data.frame(
feature = col,
mean_NR = mean_NR,
mean_R = mean_R,
W = wt$statistic,
p_value = wt$p.value
)
})
wilcox_results <- do.call(rbind, wilcox_results)
# Multiple testing correction
wilcox_results$padj <- p.adjust(wilcox_results$p_value, method = "fdr")
# Order by significance
wilcox_results <- wilcox_results[order(wilcox_results$p_value), ]
meta2 <- meta
meta2$time_responder <- paste(
meta2$timepoint,
meta2$metabolic_responder,
sep = "_"
)
#run maaslin2 based on metabolic responder status
# #fit_data <- Maaslin2(d.czm, meta2, 'maaslin/SV_metresponders_timepoint_interaction2',
# transform = "none",
# normalization = "CLR",
# analysis_method = "LM",
# fixed_effects = c("metabolic_responder","timepoint","time_responder"),
# reference = c("time_responder,1_pre_NR"),
# random_effects = c("participant"),
# standardize = FALSE)
# # samples significant for participant, BP_systolic, tot_cholesterol, LDL
# meta$participant<-as.factor(meta$participant)
#
# #run some correlations with the delta clr
# #QIDS
# # QIDS_percent_res <- data.frame()
# # for (col in delta_cols) {
# # res <- cor.test(delta_meta$QIDS_percent_change, delta_meta[[col]], method = "spearman")
# # res.df <- tidy(res)
# # res.df$SV <- col
# # QIDS_percent_res <- rbind(QIDS_percent_res, res.df)
# # }
# # QIDS_percent_res$BH <- p.adjust(QIDS_percent_res$p.value, method = "BH")
# # SV 23 aka strep salivarius negatively associated with qids percent change aka higher strep with improving depression symptomology!!
#
# # test QIDS correlation to metabolic changes
# post1 <- c(27:36)
# post2 <- names(delta_meta_post)[post1]
#
# # QIDS_metabolic <- data.frame()
# # for (metric in post2){
# # res <- cor.test(delta_meta$QIDS_percent_change, delta_meta[[metric]], method = "spearman")
# # res.df <- tidy(res)
# # res.df$metric <- metric
# # QIDS_metabolic <- rbind(QIDS_metabolic, res.df)
# # }
# # QIDS_metabolic$BH <- p.adjust(QIDS_metabolic$p.value, method = "BH") #no significant correlations between QIDS and any of the 'post' metabolic changes
#
# # OASIS_percent_res <- data.frame()
# # for (col in delta_cols) {
# # res <- cor.test(delta_meta$OASIS_percent_change, delta_meta[[col]], method = "spearman")
# # res.df <- tidy(res)
# # res.df$SV <- col
# # OASIS_percent_res <- rbind(OASIS_percent_res, res.df)
# # }
# # OASIS_percent_res$BH <- p.adjust(OASIS_percent_res$p.value, method = "BH")
# # #nothing significant
#
# # OASIS_metabolic <- data.frame()
# # for (metric in post2){
# # res <- cor.test(delta_meta$OASIS_percent_change, delta_meta[[metric]], method = "spearman")
# # res.df <- tidy(res)
# # res.df$metric <- metric
# # OASIS_metabolic <- rbind(OASIS_metabolic, res.df)
# # }
# # OASIS_metabolic$BH <- p.adjust(OASIS_metabolic$p.value, method = "BH") #no significant correlations between OASIS and any of the 'post' metabolic changes
#
# delta_meta_pre <- subset(delta_meta, timepoint == "1_pre")
# delta_meta_post <- subset(delta_meta, timepoint == "2_post")
#
# #baseline correlations
# bl1 <- c(9, 18:26)
# bl2 <- names(delta_meta_pre)[bl1]
#
# BL_cor <- data.frame()
# for (metric in bl2) {
# for (col in delta_cols) {
# res <- cor.test(delta_meta_pre[[metric]], delta_meta_pre[[col]], method = "spearman")
# res.df <- tidy(res)
# res.df$BaselineMetric <- metric
# res.df$SV <- col
# BL_cor <- rbind(BL_cor, res.df)
# }
# }
#
# BL_cor <- BL_cor %>%
# arrange(BaselineMetric, SV) %>% # <-- stable order
# group_by(BaselineMetric) %>%
# mutate(BH = p.adjust(p.value, method = "BH")) %>%
# ungroup()
# #nothing significant. So baseline vales dont correlate with change in CLR of any SV
#
# #post correlations
# post1 <- c(27:36)
# post2 <- names(delta_meta_post)[post1]
#
# post_cor <- data.frame()
# for (metric in post2) {
# for (col in delta_cols) {
# res <- cor.test(delta_meta_post[[metric]], delta_meta_post[[col]], method = "spearman")
# res.df <- tidy(res)
# res.df$PostMetric <- metric
# res.df$SV <- col
# post_cor <- rbind(post_cor, res.df)
# }
# }
#
# post_cor <- post_cor %>%
# arrange(PostMetric, SV) %>% # <-- stable order
# group_by(PostMetric) %>%
# mutate(BH = p.adjust(p.value, method = "BH")) %>%
# ungroup()
# #nothing significant
# correlate QIDS and OASIS with metabolic changes